# This script generates candidate synonyms for the "opposition" and "support" seed terms,
# used for robustness checks with an expanded axis of opposition.
# Candidates are the nearest neighbours of each seed term in a pretrained GloVe model.
# Relevant terms were then manually coded into oppterms_manual.csv and supterms_manual.csv,
# as indicated at the bottom of the script.
from gensim.models import KeyedVectors
import os
import pandas as pd

input_path = 'data/pretrained_embedding/alcembeddings/gloVe_ar/glove_vectors_arwiki.txt'
output_path = 'data/pretrained_embedding/alcembeddings/gloVe_ar/glove_vectors_arwiki_with_header.txt'

# load_word2vec_format (as called below) expects the first line of the file to
# be "<vocab_size> <vector_size>". A copy of the input with that line prepended
# is written to output_path.

# Determine vocab_size and vector_size.
# Blank lines (e.g. a trailing empty line) are not embedding rows, so they are
# skipped; counting them would make the header disagree with the rows that can
# actually be parsed.
vocab_size = 0
vector_size = None
with open(input_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        if vector_size is None:
            # First real row: one token followed by the vector components.
            vector_size = len(line.split()) - 1
        vocab_size += 1

if vector_size is None:
    # Fail here with a clear message rather than writing a meaningless header.
    raise ValueError(f"No embedding rows found in {input_path}")

# Write the new file with header, copying exactly the rows that were counted.
with open(output_path, 'w', encoding='utf-8') as f_out:
    f_out.write(f"{vocab_size} {vector_size}\n")
    with open(input_path, 'r', encoding='utf-8') as f_in:
        for line in f_in:
            if line.strip():
                f_out.write(line)

# Then load the corrected file
# (KeyedVectors is also imported at the top of the file; the repeat is harmless.)
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format(output_path, binary=False)

def get_synonyms(word, topn=100):
    """Return the terms that the loaded model ranks as most similar to `word`.

    Args:
        word: Query term passed to ``model.most_similar``.
        topn: Number of neighbours to request (default 100).

    Returns:
        A list holding the first element of each item returned by
        ``model.most_similar``, in the order the model returned them.
    """
    nearest = model.most_similar(word, topn=topn)
    # Keep only the leading element of each result item.
    return [neighbour[0] for neighbour in nearest]

# Nearest neighbours of the two seed terms:
# 'المعارضة' ("the opposition") and 'الدعم' ("the support").
word1_synonyms = get_synonyms('المعارضة')
word2_synonyms = get_synonyms('الدعم')

# One single-column frame per seed term.
oppterms = pd.DataFrame(data=word1_synonyms)
supterms = pd.DataFrame(data=word2_synonyms)

# Export both candidate lists without the index column, encoded as utf-8-sig.
for _frame, _csv_path in (
    (oppterms, 'data/pretrained_embedding/oppterms.csv'),
    (supterms, 'data/pretrained_embedding/supterms.csv'),
):
    _frame.to_csv(_csv_path, index=False, encoding='utf-8-sig')

# then manually code into oppterms_manual.csv and supterms_manual.csv